#----------------------------------------------------------#
# The Long-Term Impact of High School Financial Education: #
#                 Evidence from Brazil                     #
#                                                          #
#                                                          #
# Analysis of the Natiaonal Household Sample Survey         #
# 2019, quarter 4                                          #
#----------------------------------------------------------#


rm(list=ls())

#set working directory, including data file.
#outputs (generated in the same directory):
# "income.xlsx": income estimates to merge with experiment data
# "Figure3.png": boxplot of income by occupation

setwd(
"replace this with the path for data and results"
)

# Packages required for analysis
library(PNADcIBGE)
library(survey)
library(dplyr)
library(writexl)
library(ggplot2)


#### LOADING DATA ####

# May be downloaded directly from the National Institute of Geography and Statistics.
#   current link:   https://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/2019/

  pnadc <- read_pnadc("PNADC_042019", 
                      "input_PNADC_trimestral.txt", 
                      vars = c("UF",      #State
                               "V1028",   #Weight of household and people (weighted by population projection)
                               "V2007",   #Gender
                               "V2009",   #Age
                               "V2010",   #Race
                               "V4012",   #In that job I was...
                               "V4019",   #CNPJ of main job
                               "VD3004",  #Highest level of education achieved
                               "VD4009",  #Occupational position and job category
                               "VD4020"   #Effective income from all jobs
                      ))
  
  df<- pnadc_design(pnadc)
  
  #State
  df <- update(df,State = ifelse(UF == "17", 
                                  "TO",
                                  ifelse(UF == "23", 
                                         "CE",
                                         ifelse(UF == "31", 
                                                "MG",
                                                ifelse(UF == "33", 
                                                       "RJ",
                                                       ifelse(UF == "35", 
                                                              "SP",
                                                              ifelse(UF == "53", 
                                                                     "DF",
                                                                     "other")))))))
  
  
  #Gender
  df <- update(df,Gender = ifelse(V2007 == "1", 
                                  "M",
                                  ifelse(V2007 == "2", 
                                         "F",
                                         "other")))
  
  
  
  
  # IDENTIFYING PEOPLE IN THE BOLSA FAMÍLIA PROGRAM 
  
  df_PBF<- df
  
  df_PBF$variables<- df_PBF$variables %>% 
    group_by(ID_DOMICILIO) %>%
    summarise(
      rend_dom = sum(VD4020, na.rm = TRUE),
      n_people = n(),
      n_children = sum(V2009 <= 15, na.rm = TRUE),
      n_teenager = sum(V2009 == 16 | V2009 == 17, na.rm = TRUE))
  
  df$variables<- merge(df_PBF$variables, 
                       df$variables, by = "ID_DOMICILIO")
  
  df$variables$per_capita_income <- df$variables$rend_dom / df$variables$n_people
  

  
  
  #Criteria for inclusion in the PBF:
  # families with a per capita income of up to BRL 89.00 or to families
  # with per capita income up to BRL 178, if they include children of teenagers

  
  df$variables<- df$variables %>% 
    mutate(PBF = case_when(
      per_capita_income <= 89 ~ 1,
      per_capita_income > 89 & per_capita_income <= 178 & n_children > 0 ~ 1,
      per_capita_income > 89 & per_capita_income <= 178 & n_teenager > 0 ~ 1,
      TRUE ~ 0
    ))
  
  df$variables$PBF_str<- ifelse(df$variables$PBF == 1, "PBF","No PBF")
  
  # CREATING CATEGORIES FOR THE TYPE OF JOBS #
  
  df<- update(df, V4019 = ifelse(is.na(V4019), 0, V4019))
 
  ## The code below generates occupation names considering
  #  the order of the boxplot. Larger number of leading spaces makes the group come first.
  df <- update(df,Type_job = ifelse(VD4009 %in% c("01", "03", "05", "06", "07"), 
                                     "     Formally Employed",
                                     ifelse(VD4009 %in% c("02", "04"), 
                                            "  Informally Employed",
                                            
                                            ifelse((V4012 == "5" & (V4019 == "1" 
                                                                    )), 
                                                   "       Formal Business Owner",
                                                   ifelse(V4012 == "5" & (V4019 != "1" 
                                                                          ), 
                                                          "    Informal Business Owner",
                                                          
                                                          ifelse((V4012 == "6" & (V4019 == "1" 
                                                                                  )), 
                                                                 "      Formally Self-employed",
                                                                 ifelse(V4012 == "6" & (V4019 != "1" 
                                                                                        ), 
                                                                        "   Informally Self-employed",
                                                                        
                                                                        ifelse(VD4009 == "10", 
                                                                               "Auxiliary Worker",
                                                                               
                                                                               "Others"))))))))

  # generate trimmed Type_job
  
  df <- update(df,Type_job_tr = trimws(Type_job))
  
  #Type of job aggregating informal categories
  df <- update(df,
               occupation = ifelse(Type_job_tr ==  "Formally Employed", 
                                        "formally employed",
                                        ifelse(Type_job_tr ==  "Formally Self-employed", 
                                               "formally self-employed",
                                               ifelse(Type_job_tr == "Formal Business Owner", 
                                                      "formal business owner",
                                                      ifelse(Type_job_tr == "Informally Employed" | 
                                                               Type_job_tr == "Informally Self-employed" | 
                                                               Type_job_tr == "Informal Business Owner", 
                                                             "informal",
                                                             "other")))))
  
  
  
  #generate the indicator of the population with age and education compatibility
  # exclude auxiliary workers and those without occupation
  
  df <- update(df,subpop= (  V2009 >= 23 & V2009 <= 25 & VD3004 >= 5
                           & Type_job_tr != "Auxiliary Worker" 
                           & !is.na(Type_job_tr)  
                            )
                          )
  #generate indicator for excluded PBF informal observations
  
  df <- update(df,excludePBFinformal=
                 (
                 !(PBF_str == "PBF" & 
                     trimws(Type_job_tr) == "Informally Employed") &
                 !(PBF_str == "PBF" & 
                     trimws(Type_job_tr) == "Informal Business Owner") &
                 !(PBF_str == "PBF" & 
                     trimws(Type_job_tr) == "Informally Self-employed"))
                    )

  #### Income estimates to merge with experiment data.----
  ## Compute mean inme by occupation classification, race, education, age and state
  
  #Mean total income: 
  rendimento_est <- as.data.frame(
    svyby(formula=~VD4020, 
          by=~occupation + State + Gender, 
          design= subset(df, subpop==1 & State!= "other" & !is.na(VD4020)),
          FUN=svymean, 
          na.rm=TRUE)
  )
  
  rendimento_est$se <- NULL
  
  
  
  #Mean total income excluding infomral PBF
  rendimento_est_no_PBF <- as.data.frame(
    svyby(formula=~VD4020, 
          by=~occupation + State + Gender, 
          design=  subset(df, subpop==1 & State!= "other" & !is.na(VD4020) & excludePBFinformal==1),
          FUN=svymean, 
          na.rm=TRUE)
  )
  rendimento_est_no_PBF$se <- NULL
  
  rendimento_est_no_PBF <- rendimento_est_no_PBF %>% rename(VD4020_no_PBF=VD4020)
  
  
  
  total_income_ReStat <- merge(as.data.frame(rendimento_est),
                               as.data.frame(rendimento_est_no_PBF), 
                               by = c("occupation", "State", "Gender"))
  
  write_xlsx(total_income_ReStat 
             , "income.xlsx")  
    

## Boxplot of income by occupation----
## REMOVING PBF FROM INFORMAL GROUPS 
  df_for_residuals_excludePBFinformal<- 
    subset(df, subpop==1 & !is.na(VD4020) & excludePBFinformal==1)
  
#Fitting the model at income level
model<- svyglm(formula = VD4020 ~ 
                  V2007 + #Gender
                  UF + #State
                  V2010 + #Race
                  VD3004, #Highest level of education achieved
                design = df_for_residuals_excludePBFinformal)

summary(model)

df_for_residuals_excludePBFinformal$variables$residuals_levels<- residuals(model)

#Box Plot Graph
png( #"model_residuals_box_plot_open_categories_without_PBF_filter_age_education_4_quarter_2019_levels.png", 
  "FigureA1.png",  
  width = 1440, height = 625)

svyboxplot(formula = residuals_levels ~ Type_job, 
           design = df_for_residuals_excludePBFinformal, all.outliers = TRUE)

residual_averages_without_lvl <- svyby(formula=~residuals_levels, 
                                   by=~Type_job, 
                                   df_for_residuals_excludePBFinformal,  
                                   FUN=svymean, 
                                   na.rm=TRUE)

text(x = 1:6, y = residual_averages_without_lvl$residuals, 
     labels = round(residual_averages_without_lvl$residuals, 2), 
     pos = 3, 
     col = "blue",
     cex = 0.95)

legend("topright", 
       legend = " Average residuals", 
       fill = "blue", 
       bty = "n",
       x.intersp = 0.01, y.intersp = 0.01)
dev.off()



